library(sparklyr)
spark_installed_versions()
Create a Spark connection:
# Open (or reuse, if one already exists) a connection to a local Spark 2.4.5
# instance; `sc` is the handle every later Spark call goes through.
sc <- spark_connect(master = "local", version = "2.4.5")
Re-using existing Spark connection to local
sc
$master
[1] "local[4]"
$method
[1] "shell"
$app_name
[1] "sparklyr"
$config
$config$spark.env.SPARK_LOCAL_IP.local
[1] "127.0.0.1"
$config$sparklyr.connect.csv.embedded
[1] "^1.*"
$config$spark.sql.legacy.utcTimestampFunc.enabled
[1] TRUE
$config$sparklyr.connect.cores.local
[1] 4
$config$spark.sql.shuffle.partitions.local
[1] 4
$config$`sparklyr.shell.driver-memory`
[1] "2g"
attr(,"config")
[1] "default"
attr(,"file")
[1] "/Library/Frameworks/R.framework/Versions/3.6/Resources/library/sparklyr/conf/config-template.yml"
$state
<environment: 0x7f7f7ad3e748>
$extensions
$extensions$jars
character(0)
$extensions$packages
character(0)
$extensions$initializers
list()
$extensions$catalog_jars
character(0)
$extensions$repositories
character(0)
$spark_home
[1] "/Users/glebvulf/spark/spark-2.4.5-bin-hadoop2.7"
$backend
A connection with
description "->localhost:8881"
class "sockconn"
mode "wb"
text "binary"
opened "opened"
can read "yes"
can write "yes"
$monitoring
A connection with
description "->localhost:8881"
class "sockconn"
mode "wb"
text "binary"
opened "opened"
can read "yes"
can write "yes"
$gateway
A connection with
description "->localhost:8880"
class "sockconn"
mode "rb"
text "binary"
opened "opened"
can read "yes"
can write "yes"
$output_file
[1] "/var/folders/sh/klkdl4gx3llb0q9cv8nywfqh0000gn/T//RtmpxlLiu6/file9b65935ac53_spark.log"
$sessionId
[1] 694
$home_version
[1] "2.4.5"
attr(,"class")
[1] "spark_connection" "spark_shell_connection" "DBIConnection"
library(tidyverse)
library(janitor)
# Read the avocado CSV into a local tibble and pass it through janitor's
# clean_names() to tidy the column names.
avocado <- clean_names(read_csv("avocado.csv"))
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = [32mcol_double()[39m,
Date = [34mcol_date(format = "")[39m,
AveragePrice = [32mcol_double()[39m,
`Total Volume` = [32mcol_double()[39m,
`4046` = [32mcol_double()[39m,
`4225` = [32mcol_double()[39m,
`4770` = [32mcol_double()[39m,
`Total Bags` = [32mcol_double()[39m,
`Small Bags` = [32mcol_double()[39m,
`Large Bags` = [32mcol_double()[39m,
`XLarge Bags` = [32mcol_double()[39m,
type = [31mcol_character()[39m,
year = [32mcol_double()[39m,
region = [31mcol_character()[39m
)
avocado_spark <- copy_to(sc, avocado)
Warning in doTryCatch(return(expr), name, parentenv, handler) :
restarting interrupted promise evaluation
# Read the CSV straight into Spark, bypassing the local R session.
avocado_spark <- spark_read_csv(sc, path = "avocado.csv")
# Re-point the handle at the Spark table registered as "avocado"; this replaces
# whatever avocado_spark referred to before.
avocado_spark<- tbl(sc, "avocado")
# List the tables currently registered on the connection.
src_tbls(sc)
[1] "avocado" "prestige"
class(avocado_spark)
[1] "tbl_spark" "tbl_sql" "tbl_lazy" "tbl"
str(avocado_spark)
List of 2
$ src:List of 1
..$ con:List of 13
.. ..$ master : chr "local[4]"
.. ..$ method : chr "shell"
.. ..$ app_name : chr "sparklyr"
.. ..$ config :List of 6
.. .. ..$ spark.env.SPARK_LOCAL_IP.local : chr "127.0.0.1"
.. .. ..$ sparklyr.connect.csv.embedded : chr "^1.*"
.. .. ..$ spark.sql.legacy.utcTimestampFunc.enabled: logi TRUE
.. .. ..$ sparklyr.connect.cores.local : int 4
.. .. ..$ spark.sql.shuffle.partitions.local : int 4
.. .. ..$ sparklyr.shell.driver-memory : chr "2g"
.. .. ..- attr(*, "config")= chr "default"
.. .. ..- attr(*, "file")= chr "/Library/Frameworks/R.framework/Versions/3.6/Resources/library/sparklyr/conf/config-template.yml"
.. ..$ state :<environment: 0x7f7f7ad3e748>
.. ..$ extensions :List of 5
.. .. ..$ jars : chr(0)
.. .. ..$ packages : chr(0)
.. .. ..$ initializers: list()
.. .. ..$ catalog_jars: chr(0)
.. .. ..$ repositories: chr(0)
.. ..$ spark_home : chr "/Users/glebvulf/spark/spark-2.4.5-bin-hadoop2.7"
.. ..$ backend : 'sockconn' int 4
.. .. ..- attr(*, "conn_id")=<externalptr>
.. ..$ monitoring : 'sockconn' int 6
.. .. ..- attr(*, "conn_id")=<externalptr>
.. ..$ gateway : 'sockconn' int 3
.. .. ..- attr(*, "conn_id")=<externalptr>
.. ..$ output_file : chr "/var/folders/sh/klkdl4gx3llb0q9cv8nywfqh0000gn/T//RtmpxlLiu6/file9b65935ac53_spark.log"
.. ..$ sessionId : num 694
.. ..$ home_version: chr "2.4.5"
.. ..- attr(*, "class")= chr [1:3] "spark_connection" "spark_shell_connection" "DBIConnection"
..- attr(*, "class")= chr [1:3] "src_spark" "src_sql" "src"
$ ops:List of 2
..$ x : 'ident' chr "avocado"
..$ vars: chr [1:14] "_c0" "Date" "AveragePrice" "Total_Volume" ...
..- attr(*, "class")= chr [1:3] "op_base_remote" "op_base" "op"
- attr(*, "class")= chr [1:4] "tbl_spark" "tbl_sql" "tbl_lazy" "tbl"
library(pryr)
object_size(avocado)
2.06 MB
object_size(avocado_spark)
58.2 kB
# Compute the mean of AveragePrice inside Spark and pull only that summary
# back into the local R session.
selected_avocado_spark <- avocado_spark %>%
select(AveragePrice) %>%
summarise(av_mean = mean(AveragePrice)) %>%
collect() # calling collect() without summarising first is not a good idea,
# as there could be too much data to bring into R
#object_size(selected_avocado_spark)
# Drop the price column and store the result in Spark under the table name
# "avocado_wo_price".
avocado_wo_price <- compute(
  select(avocado_spark, -AveragePrice),
  "avocado_wo_price"
)
# Print the SQL that dplyr generates for a single-column selection.
show_query(select(avocado_spark, AveragePrice))
<SQL>
SELECT `AveragePrice`
FROM `avocado`
# Add a logical flag marking rows whose average price exceeds 1.40. The
# assignment is kept separate from the show_query() call, which is used only
# for its printed SQL.
avocado_spark <- mutate(avocado_spark, high_average = AveragePrice > 1.40)
show_query(avocado_spark)
<SQL>
SELECT `_c0`, `Date`, `AveragePrice`, `Total_Volume`, `4046`, `4225`, `4770`, `Total_Bags`, `Small_Bags`, `Large_Bags`, `XLarge_Bags`, `type`, `year`, `region`, `AveragePrice` > 1.4 AS `high_average`
FROM `avocado`
# Preview the new flag column, then print the SQL behind that preview.
show_query(glimpse(select(avocado_spark, high_average)))
Rows: ??
Columns: 1
Database: spark_connection
$ high_average [3m[38;5;246m<lgl>[39m[23m FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
<SQL>
SELECT `AveragePrice` > 1.4 AS `high_average`
FROM `avocado`
# Stack a second flag on top of the previous mutate; the printed SQL shows the
# earlier step nested as a subquery.
result <- mutate(avocado_spark, low_average = AveragePrice < 1.0)
show_query(result)
<SQL>
SELECT `_c0`, `Date`, `AveragePrice`, `Total_Volume`, `4046`, `4225`, `4770`, `Total_Bags`, `Small_Bags`, `Large_Bags`, `XLarge_Bags`, `type`, `year`, `region`, `high_average`, `AveragePrice` < 1.0 AS `low_average`
FROM (SELECT `_c0`, `Date`, `AveragePrice`, `Total_Volume`, `4046`, `4225`, `4770`, `Total_Bags`, `Small_Bags`, `Large_Bags`, `XLarge_Bags`, `type`, `year`, `region`, `AveragePrice` > 1.4 AS `high_average`
FROM `avocado`) `dbplyr_016`
# Pull the lazy Spark table into a local tibble, then inspect the class of the
# still-remote original for comparison.
collected_result <- collect(result)
class(result)
[1] "tbl_spark" "tbl_sql" "tbl_lazy" "tbl"
class(collected_result)
[1] "tbl_df" "tbl" "data.frame"
Spark web interface
spark_web(sc)
library(sparklyr)
sc <- spark_connect(master = "local", version = "2.4.5")
Re-using existing Spark connection to local
library(dbplot)
library(car)
# Copy the Prestige data frame into Spark, then show the SQL generated for
# averaging every numeric column.
prestige <- copy_to(dest = sc, df = Prestige)
show_query(summarise_if(prestige, is.numeric, mean))
Applying predicate on the first 100 rows
<SQL>
SELECT AVG(`education`) AS `education`, AVG(`income`) AS `income`, AVG(`women`) AS `women`, AVG(`prestige`) AS `prestige`, AVG(`census`) AS `census`
FROM `Prestige`
# Same as the mean summary, but with the variance of each numeric column.
show_query(summarise_if(prestige, is.numeric, var))
Applying predicate on the first 100 rows
<SQL>
SELECT var_samp(`education`) AS `education`, var_samp(`income`) AS `income`, var_samp(`women`) AS `women`, var_samp(`prestige`) AS `prestige`, var_samp(`census`) AS `census`
FROM `Prestige`
# Mean income by education group, shown as SQL rather than executed for its
# result.
prestige %>%
# Label each row "Yes"/"No" depending on whether education exceeds 7.
mutate(secondary_educated = ifelse(education > 7, "Yes", "No")) %>%
group_by(secondary_educated) %>%
summarise(mean_income = mean(income)) %>%
# Print the generated query.
show_query()
<SQL>
SELECT `secondary_educated`, AVG(`income`) AS `mean_income`
FROM (SELECT `education`, `income`, `women`, `prestige`, `census`, `type`, CASE WHEN (`education` > 7.0) THEN ("Yes") WHEN NOT(`education` > 7.0) THEN ("No") END AS `secondary_educated`
FROM `Prestige`) `dbplyr_019`
GROUP BY `secondary_educated`
# dplyr-style column selection works on the remote Spark table.
glimpse(select(prestige, income, education))
Rows: ??
Columns: 2
Database: spark_connection
$ income [3m[38;5;246m<int>[39m[23m 12351, 25879, 9271, 8865, 8403, 11030, 8258, 14163, 11377, 11023, 59…
$ education [3m[38;5;246m<dbl>[39m[23m 13.11, 12.26, 12.77, 11.42, 14.62, 15.64, 15.09, 15.44, 14.52, 14.64…
# Matrix-style `[` indexing on the remote Spark table raises an error; print
# the condition instead of aborting.
tryCatch(
  glimpse(prestige[, c("education", "income")]),
  error = print
)
<simpleError in prestige[, c("education", "income")]: incorrect number of dimensions>
# The same `[` indexing succeeds on the local Prestige data frame.
tryCatch(
  glimpse(Prestige[, c("education", "income")]),
  error = print
)
Rows: 102
Columns: 2
$ education [3m[38;5;246m<dbl>[39m[23m 13.11, 12.26, 12.77, 11.42, 14.62, 15.64, 15.09, 15.44, 14.52, 14.64…
$ income [3m[38;5;246m<int>[39m[23m 12351, 25879, 9271, 8865, 8403, 11030, 8258, 14163, 11377, 11023, 59…
Passthrough
# percentile(), array() and explode() are not R functions here; as the printed
# query shows, they are passed through unchanged into the generated SQL.
prestige %>%
# Quartiles of `women`, returned as one array-valued column.
summarise(women_percentile = percentile(women, array(0.25, 0.5, 0.75))) %>%
# Apply explode() to the array column.
mutate(women_percentile = explode(women_percentile)) %>%
show_query()
<SQL>
SELECT explode(`women_percentile`) AS `women_percentile`
FROM (SELECT percentile(`women`, array(0.25, 0.5, 0.75)) AS `women_percentile`
FROM `Prestige`) `dbplyr_021`
Raster Plot
schema
$education
$education$name
[1] "education"
$education$type
[1] "DoubleType"
$income
$income$name
[1] "income"
$income$type
[1] "IntegerType"
$women
$women$name
[1] "women"
$women$type
[1] "DoubleType"
$prestige
$prestige$name
[1] "prestige"
$prestige$type
[1] "DoubleType"
$census
$census$name
[1] "census"
$census$type
[1] "IntegerType"
$type
$type$name
[1] "type"
$type$type
[1] "StringType"
SDF: `sdf_…` functions operate on Spark DataFrames
Sorting
Sampling
Partitioning (e.g. train/test split)
Binding
ls("package:sparklyr", pattern = "^ft" )
[1] "ft_binarizer" "ft_bucketed_random_projection_lsh"
[3] "ft_bucketizer" "ft_chisq_selector"
[5] "ft_count_vectorizer" "ft_dct"
[7] "ft_discrete_cosine_transform" "ft_dplyr_transformer"
[9] "ft_elementwise_product" "ft_feature_hasher"
[11] "ft_hashing_tf" "ft_idf"
[13] "ft_imputer" "ft_index_to_string"
[15] "ft_interaction" "ft_max_abs_scaler"
[17] "ft_min_max_scaler" "ft_minhash_lsh"
[19] "ft_ngram" "ft_normalizer"
[21] "ft_one_hot_encoder" "ft_one_hot_encoder_estimator"
[23] "ft_pca" "ft_polynomial_expansion"
[25] "ft_quantile_discretizer" "ft_r_formula"
[27] "ft_regex_tokenizer" "ft_sql_transformer"
[29] "ft_standard_scaler" "ft_stop_words_remover"
[31] "ft_string_indexer" "ft_string_indexer_model"
[33] "ft_tokenizer" "ft_vector_assembler"
[35] "ft_vector_indexer" "ft_vector_slicer"
[37] "ft_word2vec"
MLlib: `ft_…` functions are feature transformers; `ml_…` functions are machine learning
# Load the Parquet data from "prestige_data" into Spark as "prestige_again",
# then preview it.
prestige_again <- spark_read_parquet(
  sc,
  name = "prestige_again",
  path = "prestige_data"
)
glimpse(prestige_again)
Rows: ??
Columns: 6
Database: spark_connection
$ education [3m[38;5;246m<dbl>[39m[23m 13.11, 12.26, 12.77, 11.42, 14.62, 15.64, 15.09, 15.44, 14.52, 14.64…
$ income [3m[38;5;246m<int>[39m[23m 12351, 25879, 9271, 8865, 8403, 11030, 8258, 14163, 11377, 11023, 59…
$ women [3m[38;5;246m<dbl>[39m[23m 11.16, 4.02, 15.70, 9.11, 11.68, 5.13, 25.65, 2.69, 1.03, 0.94, 1.91…
$ prestige [3m[38;5;246m<dbl>[39m[23m 68.8, 69.1, 63.4, 56.8, 73.5, 77.6, 72.6, 78.1, 73.1, 68.8, 62.0, 60…
$ census [3m[38;5;246m<int>[39m[23m 1113, 1130, 1171, 1175, 2111, 2113, 2133, 2141, 2143, 2153, 2161, 21…
$ type [3m[38;5;246m<chr>[39m[23m "prof", "prof", "prof", "prof", "prof", "prof", "prof", "prof", "pro…
glimpse(profiles)
Rows: ??
Columns: 31
Database: spark_connection
$ age [3m[38;5;246m<int>[39m[23m 47, 27, 45, 40, 33, 27, 20, 28, 24, 34, 32, 23, 19, 45, 32, 38, 36…
$ body_type [3m[38;5;246m<chr>[39m[23m "athletic", "full figured", NA, "athletic", "athletic", "thin", "c…
$ diet [3m[38;5;246m<chr>[39m[23m "mostly anything", "mostly vegetarian", NA, NA, "mostly anything",…
$ drinks [3m[38;5;246m<chr>[39m[23m "socially", "socially", "socially", "socially", "rarely", "sociall…
$ drugs [3m[38;5;246m<chr>[39m[23m "never", NA, "never", "never", "never", "sometimes", "never", "nev…
$ education [3m[38;5;246m<chr>[39m[23m "graduated from college/university", "graduated from college/unive…
$ essay0 [3m[38;5;246m<chr>[39m[23m NA, "still figuring out what to put here... for now i'll leave you…
$ essay1 [3m[38;5;246m<chr>[39m[23m "working in a creative industry (brand marketing agency), helping …
$ essay2 [3m[38;5;246m<chr>[39m[23m "listening to people, hearing them and working to help them. it's\…
$ essay3 [3m[38;5;246m<chr>[39m[23m "my million dollar smile (self-deprecating humor).", "my smile, my…
$ essay4 [3m[38;5;246m<chr>[39m[23m "sadly, my reading is confined to newspapers and business\nperiodi…
$ essay5 [3m[38;5;246m<chr>[39m[23m "my daughter. usa today. exercise. some travel. world news. ice\nc…
$ essay6 [3m[38;5;246m<chr>[39m[23m NA, NA, "why i fill these blocks.<br />\nwhat i really, really wan…
$ essay7 [3m[38;5;246m<chr>[39m[23m "picking up my daughter for the weekend or going to the gym and\nc…
$ essay8 [3m[38;5;246m<chr>[39m[23m "i can be a procrastinator but it is often driven by my need to\nr…
$ essay9 [3m[38;5;246m<chr>[39m[23m "you are looking for an easy going, no drama, try anything once\na…
$ ethnicity [3m[38;5;246m<chr>[39m[23m "white", "white", "other", "white", "white", "hispanic / latin, wh…
$ height [3m[38;5;246m<int>[39m[23m 72, 63, 66, 62, 74, 71, 68, 66, 62, 67, 61, 59, 68, 68, 67, 73, 75…
$ income [3m[38;5;246m<int>[39m[23m -1, -1, -1, -1, -1, -1, 20000, -1, -1, -1, 70000, -1, -1, -1, -1, …
$ job [3m[38;5;246m<chr>[39m[23m "sales / marketing / biz dev", "science / tech / engineering", "me…
$ last_online [3m[38;5;246m<chr>[39m[23m "2012-06-29-22-37", "2012-06-30-12-50", "2012-06-30-18-21", "2012-…
$ location [3m[38;5;246m<chr>[39m[23m "san carlos, california", "oakland, california", "san francisco, c…
$ offspring [3m[38;5;246m<chr>[39m[23m "has a kid, but doesn’t want more", "doesn’t have kids…
$ orientation [3m[38;5;246m<chr>[39m[23m "straight", "straight", "straight", "straight", "gay", "gay", "str…
$ pets [3m[38;5;246m<chr>[39m[23m "likes dogs", "likes dogs", "likes dogs and likes cats", "has dogs…
$ religion [3m[38;5;246m<chr>[39m[23m "other", "atheism but not too serious about it", "catholicism but …
$ sex [3m[38;5;246m<chr>[39m[23m "m", "f", "m", "f", "m", "m", "f", "f", "f", "f", "f", "f", "m", "…
$ sign [3m[38;5;246m<chr>[39m[23m "scorpio and it’s fun to think about", "sagittarius but it d…
$ smokes [3m[38;5;246m<chr>[39m[23m "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
$ speaks [3m[38;5;246m<chr>[39m[23m "english", "english", "english (fluently)", "english", "english", …
$ status [3m[38;5;246m<chr>[39m[23m "single", "single", "single", "single", "single", "single", "singl…
# Count the missing values in every column of `profiles`.
# is.na() is cast to integer so it can be summed; na.rm = TRUE is spelled out
# because SQL aggregates always drop missing values and dbplyr warns when the
# argument is left implicit.
profiles %>%
  summarise_all(.funs = ~ sum(as.integer(is.na(.)), na.rm = TRUE))
Missing values are always removed in SQL.
Use `SUM(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
# All columns other than age, income and height: replace missing values with
# the literal label "missing".
profiles_char <- mutate_all(
  select(profiles, -c(age, income, height)),
  ~ ifelse(is.na(.), "missing", .)
)
# Numeric columns: cast to double and turn the -1 income placeholder into NA.
# `income` is an integer column, so it is compared against an integer literal
# rather than the string "-1" (which relied on an implicit cast in Spark).
profiles_num <- profiles %>%
  select(age, income, height) %>%
  mutate(
    age = as.numeric(age),
    income = ifelse(income == -1L, NA, as.numeric(income)),
    height = as.numeric(height)
  )
# Recombine the character and numeric halves column-wise and store the result
# in Spark as "profiles".
profiles <- compute(
  sdf_bind_cols(profiles_char, profiles_num),
  "profiles"
)
Machine Learning model
# Response variable for the model: 1 when the profile's job is student,
# unemployed or retired, 0 otherwise.
# Fix: the job label was misspelled "retured", so retired users were never
# matched and were all coded as 0.
profiles <- profiles %>%
  mutate(
    not_working = as.integer(
      ifelse(job %in% c("student", "unemployed", "retired"), 1, 0)
    )
  )
# Class balance of the response variable.
count(profiles, not_working)
Categorical encoding: use ft_string_indexer() and ft_one_hot_encoder() to make dummy variables
# Encode drinks, drugs and status in turn: each column is string-indexed into
# <column>_indexed and then one-hot encoded into <column>_encoded. The same
# two-step transformation is folded over the column names instead of being
# written out three times; the result is stored in Spark as "profiles".
profiles <- Reduce(
  function(tbl, column) {
    indexed_col <- paste0(column, "_indexed")
    tbl %>%
      ft_string_indexer(
        input_col = column,
        output_col = indexed_col
      ) %>%
      ft_one_hot_encoder(
        input_col = indexed_col,
        output_col = paste0(column, "_encoded")
      )
  },
  c("drinks", "drugs", "status"),
  profiles
) %>%
  compute("profiles")
glimpse(profiles)
Rows: ??
Columns: 38
Database: spark_connection
$ body_type [3m[38;5;246m<chr>[39m[23m "athletic", "full figured", "missing", "athletic", "athletic", …
$ diet [3m[38;5;246m<chr>[39m[23m "mostly anything", "mostly vegetarian", "missing", "missing", "…
$ drinks [3m[38;5;246m<chr>[39m[23m "socially", "socially", "socially", "socially", "rarely", "soci…
$ drugs [3m[38;5;246m<chr>[39m[23m "never", "missing", "never", "never", "never", "sometimes", "ne…
$ education [3m[38;5;246m<chr>[39m[23m "graduated from college/university", "graduated from college/un…
$ essay0 [3m[38;5;246m<chr>[39m[23m "missing", "still figuring out what to put here... for now i'll…
$ essay1 [3m[38;5;246m<chr>[39m[23m "working in a creative industry (brand marketing agency), helpi…
$ essay2 [3m[38;5;246m<chr>[39m[23m "listening to people, hearing them and working to help them. it…
$ essay3 [3m[38;5;246m<chr>[39m[23m "my million dollar smile (self-deprecating humor).", "my smile,…
$ essay4 [3m[38;5;246m<chr>[39m[23m "sadly, my reading is confined to newspapers and business\nperi…
$ essay5 [3m[38;5;246m<chr>[39m[23m "my daughter. usa today. exercise. some travel. world news. ice…
$ essay6 [3m[38;5;246m<chr>[39m[23m "missing", "missing", "why i fill these blocks.<br />\nwhat i r…
$ essay7 [3m[38;5;246m<chr>[39m[23m "picking up my daughter for the weekend or going to the gym and…
$ essay8 [3m[38;5;246m<chr>[39m[23m "i can be a procrastinator but it is often driven by my need to…
$ essay9 [3m[38;5;246m<chr>[39m[23m "you are looking for an easy going, no drama, try anything once…
$ ethnicity [3m[38;5;246m<chr>[39m[23m "white", "white", "other", "white", "white", "hispanic / latin,…
$ job [3m[38;5;246m<chr>[39m[23m "sales / marketing / biz dev", "science / tech / engineering", …
$ last_online [3m[38;5;246m<chr>[39m[23m "2012-06-29-22-37", "2012-06-30-12-50", "2012-06-30-18-21", "20…
$ location [3m[38;5;246m<chr>[39m[23m "san carlos, california", "oakland, california", "san francisco…
$ offspring [3m[38;5;246m<chr>[39m[23m "has a kid, but doesn’t want more", "doesn’t have k…
$ orientation [3m[38;5;246m<chr>[39m[23m "straight", "straight", "straight", "straight", "gay", "gay", "…
$ pets [3m[38;5;246m<chr>[39m[23m "likes dogs", "likes dogs", "likes dogs and likes cats", "has d…
$ religion [3m[38;5;246m<chr>[39m[23m "other", "atheism but not too serious about it", "catholicism b…
$ sex [3m[38;5;246m<chr>[39m[23m "m", "f", "m", "f", "m", "m", "f", "f", "f", "f", "f", "f", "m"…
$ sign [3m[38;5;246m<chr>[39m[23m "scorpio and it’s fun to think about", "sagittarius but i…
$ smokes [3m[38;5;246m<chr>[39m[23m "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "no…
$ speaks [3m[38;5;246m<chr>[39m[23m "english", "english", "english (fluently)", "english", "english…
$ status [3m[38;5;246m<chr>[39m[23m "single", "single", "single", "single", "single", "single", "si…
$ age [3m[38;5;246m<dbl>[39m[23m 47, 27, 45, 40, 33, 27, 20, 28, 24, 34, 32, 23, 19, 45, 32, 38,…
$ income [3m[38;5;246m<dbl>[39m[23m NaN, NaN, NaN, NaN, NaN, NaN, 20000, NaN, NaN, NaN, 70000, NaN,…
$ height [3m[38;5;246m<dbl>[39m[23m 72, 63, 66, 62, 74, 71, 68, 66, 62, 67, 61, 59, 68, 68, 67, 73,…
$ not_working [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ drinks_indexed [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 4, …
$ drinks_encoded [3m[38;5;246m<list>[39m[23m [<1, 0, 0, 0, 0, 0>, <1, 0, 0, 0, 0, 0>, <1, 0, 0, 0, 0, 0>, <…
$ drugs_indexed [3m[38;5;246m<dbl>[39m[23m 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 1, 0, 1, 0, …
$ drugs_encoded [3m[38;5;246m<list>[39m[23m [<1, 0, 0>, <0, 1, 0>, <1, 0, 0>, <1, 0, 0>, <1, 0, 0>, <0, 0,…
$ status_indexed [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, …
$ status_encoded [3m[38;5;246m<list>[39m[23m [<1, 0, 0, 0>, <1, 0, 0, 0>, <1, 0, 0, 0>, <1, 0, 0, 0>, <1, 0…
Train Test splitting
# Seeded random 70/30 split; each partition is stored in Spark under its own
# table name.
partitioned <- sdf_random_split(
  profiles,
  training = 0.7,
  testing = 0.3,
  seed = 42
)
training <- compute(partitioned$training, "training")
testing <- compute(partitioned$testing, "testing")
scale Age
Age - mean, sd
# Standardise age on the training set: subtract the mean, divide by the sd.
# NOTE(review): `scaling_values` is not defined in the visible part of this
# file — presumably a local summary holding `mean_age` and `sd_age`; confirm.
training <- training %>%
mutate(
# `!!` evaluates the local R values first, so they enter the query as constants.
scaled_age = (age - !!scaling_values$mean_age) / !!scaling_values$sd_age
) %>%
glimpse()
Rows: ??
Columns: 39
Database: spark_connection
$ body_type [3m[38;5;246m<chr>[39m[23m "a little extra", "a little extra", "a little extra", "a little…
$ diet [3m[38;5;246m<chr>[39m[23m "anything", "anything", "anything", "anything", "anything", "an…
$ drinks [3m[38;5;246m<chr>[39m[23m "rarely", "socially", "socially", "socially", "socially", "soci…
$ drugs [3m[38;5;246m<chr>[39m[23m "never", "never", "never", "never", "never", "never", "never", …
$ education [3m[38;5;246m<chr>[39m[23m "graduated from masters program", "dropped out of college/unive…
$ essay0 [3m[38;5;246m<chr>[39m[23m "hello.... well this is my 1st time on an internet dating site.…
$ essay1 [3m[38;5;246m<chr>[39m[23m "working hard, so i can play harder!!!", "trying to find out wh…
$ essay2 [3m[38;5;246m<chr>[39m[23m "trying lots of new things :) making new friends.", "listening …
$ essay3 [3m[38;5;246m<chr>[39m[23m "that i'm easy going. non-high maintenance.", "my hair", "my cr…
$ essay4 [3m[38;5;246m<chr>[39m[23m "i haven't watched tv for about 2 yrs now.... well, except when…
$ essay5 [3m[38;5;246m<chr>[39m[23m "family, friends, food, water, shelter... is there a 6th i have…
$ essay6 [3m[38;5;246m<chr>[39m[23m "what's in my future.... the next challenge....", "about the fu…
$ essay7 [3m[38;5;246m<chr>[39m[23m "working, or trying to sleep b/c i might get called back to wor…
$ essay8 [3m[38;5;246m<chr>[39m[23m "hmmmm..... i can't stand piles of dirty laundry or dirty dishe…
$ essay9 [3m[38;5;246m<chr>[39m[23m "you meet at least some of the stated interests, or your are wi…
$ ethnicity [3m[38;5;246m<chr>[39m[23m "asian", "hispanic / latin", "white", "white", "white", "hispan…
$ job [3m[38;5;246m<chr>[39m[23m "medicine / health", "other", "computer / hardware / software",…
$ last_online [3m[38;5;246m<chr>[39m[23m "2012-06-30-16-12", "2012-06-22-08-43", "2012-06-13-12-41", "20…
$ location [3m[38;5;246m<chr>[39m[23m "oakland, california", "hayward, california", "emeryville, cali…
$ offspring [3m[38;5;246m<chr>[39m[23m "doesn’t have kids, and doesn’t want any", "doesn&r…
$ orientation [3m[38;5;246m<chr>[39m[23m "straight", "straight", "straight", "gay", "bisexual", "straigh…
$ pets [3m[38;5;246m<chr>[39m[23m "likes dogs", "has dogs", "likes dogs and likes cats", "has dog…
$ religion [3m[38;5;246m<chr>[39m[23m "missing", "catholicism but not too serious about it", "other a…
$ sex [3m[38;5;246m<chr>[39m[23m "f", "m", "m", "m", "m", "m", "f", "m", "m", "m", "f", "m", "m"…
$ sign [3m[38;5;246m<chr>[39m[23m "leo", "leo and it’s fun to think about", "libra but it d…
$ smokes [3m[38;5;246m<chr>[39m[23m "missing", "sometimes", "no", "no", "no", "no", "missing", "som…
$ speaks [3m[38;5;246m<chr>[39m[23m "english", "english (fluently), spanish (okay)", "english (flue…
$ status [3m[38;5;246m<chr>[39m[23m "single", "single", "single", "single", "available", "single", …
$ age [3m[38;5;246m<dbl>[39m[23m 44, 25, 26, 32, 45, 32, 26, 20, 26, 22, 23, 28, 39, 59, 55, 45,…
$ income [3m[38;5;246m<dbl>[39m[23m NaN, 2e+04, 3e+04, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN,…
$ height [3m[38;5;246m<dbl>[39m[23m 67, 71, 74, 68, 67, 67, 63, 68, 66, 75, 65, 69, 72, 65, 72, 72,…
$ not_working [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ drinks_indexed [3m[38;5;246m<dbl>[39m[23m 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, …
$ drinks_encoded [3m[38;5;246m<list>[39m[23m [<0, 1, 0, 0, 0, 0>, <1, 0, 0, 0, 0, 0>, <1, 0, 0, 0, 0, 0>, <…
$ drugs_indexed [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0, 3, 2, 1, 0, …
$ drugs_encoded [3m[38;5;246m<list>[39m[23m [<1, 0, 0>, <1, 0, 0>, <1, 0, 0>, <1, 0, 0>, <1, 0, 0>, <1, 0,…
$ status_indexed [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, …
$ status_encoded [3m[38;5;246m<list>[39m[23m [<1, 0, 0, 0>, <1, 0, 0, 0>, <1, 0, 0, 0>, <1, 0, 0, 0>, <0, 0…
$ scaled_age [3m[38;5;246m<dbl>[39m[23m 1.27623744, -0.77314810, -0.66528570, -0.01811132, 1.38409983, …
Model: not_working ~ scaled_age, drinks, drugs, status
# Evaluate the fitted logistic regression and print the available metrics.
# NOTE(review): `logreg_model` is not defined in the visible part of this file.
# NOTE(review): the model is evaluated on `training`, so the metrics describe
# in-sample fit; for a real validation estimate, evaluate on `testing` after
# applying the same age scaling — confirm intent.
validation_info <- ml_evaluate(logreg_model, training)
validation_info
BinaryLogisticRegressionSummaryImpl
Access the following via `$` or `ml_summary()`.
- features_col()
- label_col()
- predictions()
- probability_col()
- area_under_roc()
- f_measure_by_threshold()
- pr()
- precision_by_threshold()
- recall_by_threshold()
- roc()
- prediction_col()
- accuracy()
- f_measure_by_label()
- false_positive_rate_by_label()
- labels()
- precision_by_label()
- recall_by_label()
- true_positive_rate_by_label()
- weighted_f_measure()
- weighted_false_positive_rate()
- weighted_precision()
- weighted_recall()
- weighted_true_positive_rate()
# Bring the ROC curve points (FPR, TPR) into the local session and preview
# them.
roc <- collect(validation_info$roc())
glimpse(roc)
Rows: 110
Columns: 2
$ FPR [3m[38;5;246m<dbl>[39m[23m 0.000000000, 0.002642008, 0.007265522, 0.011492734, 0.016776750, 0.0196829…
$ TPR [3m[38;5;246m<dbl>[39m[23m 0.00000000, 0.03005464, 0.07103825, 0.09699454, 0.15300546, 0.16530055, 0.…
# ROC curve with a dashed reference line and a fixed 1:1 aspect ratio.
ggplot(data = roc, mapping = aes(x = FPR, y = TPR)) +
  geom_line() +
  geom_abline(linetype = "dashed") +
  coord_fixed()